In [5]:
# set up
import pandas as pd
import numpy as np
from collections import defaultdict

# models
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
# nlp
import nltk
from nltk.tokenize import word_tokenize # nltk tokenizer
nltk_stopwords = nltk.corpus.stopwords.words('english') # nltk stop words
from nltk.util import ngrams
# sys
import warnings
warnings.filterwarnings('ignore')
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

Overview

Cluster reviewers based on their review text, and use the insights from each cluster to understand how each cluster of customers perceives the price, quality, and value of each product.

Data Preprocessing

In [6]:
# Load the preprocessed review data produced by the earlier preprocessing notebook.
# Provenance: "df_all.pkl" is a pickled pandas DataFrame; only load pickles you
# created yourself (unpickling untrusted files can execute arbitrary code).
df = pd.read_pickle("df_all.pkl")
In [4]:
df.shape
Out[4]:
(512821, 25)
In [7]:
# Drop rows with missing review text. With a single-column subset, how='all'
# behaves identically to the default how='any'.
df = df.dropna(how = 'all',subset = ["Sound Bite Text"]) # drop rows with review text missing
In [8]:
# Remove reviews whose text was deleted by the author; a boolean-mask filter
# replaces the original index-lookup + inplace drop, with the same resulting rows.
df = df[df["Sound Bite Text"] != "Post deleted by the author."]
In [9]:
df.shape
Out[9]:
(321344, 25)
In [11]:
# Subsample 2,000 reviews for the cluster-count search (random_state pins the draw).
# NOTE(review): no `stratify=` argument is passed, so this is plain random
# sampling, NOT stratified on product class despite the original comment —
# pass stratify=<product-class column> to actually stratify.
from sklearn.utils import resample
df_new = resample(df, n_samples = 2000, replace=False, random_state=0)

# df_stratified = df.loc[products_new.index]
In [12]:
len(df_new)
Out[12]:
2000

Clustering

Choose the optimal k

Hypothesis: from the previous EDA on titles we know that topics mostly revolve around the products, and we know that there are three products: Samsung Galaxy, iPhone X, and iPhone 8. There are three dimensions that reviewers care about: price, quality, and value. Based on these two assumptions, the number of clusters should be between 6 and 9.

In [13]:
# Fit a TF-IDF vectorizer (English stop words removed) on the 2,000-review
# subsample; X is a sparse (n_docs x n_terms) matrix used only for the elbow search.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df_new["Sound Bite Text"])
In [14]:
# Elbow search: fit k-means for k = 1..14 and record the inertia
# (within-cluster sum of squared distances) at each k.
# Fix: k-means init is stochastic and n_init=1 gives a single random start,
# so without random_state the elbow curve differed on every re-run; the seed
# makes the analysis reproducible.
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init=1,
                random_state=0)
    km = km.fit(X)
    Sum_of_squared_distances.append(km.inertia_)
In [15]:
# Elbow plot: look for the k where the marginal drop in inertia flattens.
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')  # was missing — figures should stand alone
plt.title('Elbow Method For Optimal k')
plt.show()

The plot confirms our previous assumption that the optimal k lies between 6 and 9, as the marginal decrease in the sum of squared distances slows down in that range. So let's set k = 6.

kmeans clustering to cluster review text

In [16]:
# Pipeline: bag-of-words counts -> TF-IDF weighting -> k-means (k = 6).
# NOTE(review): CountVectorizer + TfidfTransformer is equivalent to the single
# TfidfVectorizer used elsewhere in this notebook, and no random_state is set,
# so this fit is not reproducible. No later cell reads `text_cluster`; the
# final model is retrained on the full data below.
text_cluster = Pipeline([('vect', CountVectorizer(stop_words = 'english')),
                      ('tfidf', TfidfTransformer()),
                      ('km', KMeans(n_clusters = 6, init='k-means++', max_iter=1000, n_init=1))])
In [17]:
# Fit the pipeline on the 2,000-review subsample (fitted pipeline repr shown below).
text_cluster.fit(df_new["Sound Bite Text"])
Out[17]:
Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('km',
                 KMeans(algorithm='auto', copy_x=True, init='k-means++',
                        max_iter=1000, n_clusters=6, n_init=1, n_jobs=None,
                        precompute_distances='auto', random_state=None,
                        tol=0.0001, verbose=0))],
         verbose=False)
In [18]:
# Vectorize the FULL cleaned dataset (not the 2,000-row subsample) with TF-IDF.
documents = df["Sound Bite Text"]
# Vector-transform and remove English stop words. Note this re-binds
# `vectorizer` and `X`, replacing the subsample versions fitted above.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
In [19]:
# Cache the large sparse TF-IDF matrix so it need not be recomputed on re-run.
# Fix: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
# in 0.23 — import the standalone joblib package instead (same dump/load API).
import joblib
# dump to pickle
joblib.dump(X, 'x_vector.pkl')
Out[19]:
['x_vector.pkl']
In [20]:
# Reload the cached TF-IDF matrix (cheap restart point after a kernel reset).
X = joblib.load('x_vector.pkl')
In [21]:
X.shape
Out[21]:
(321344, 267090)
In [85]:
# train model and choose k
true_k = 6
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000, n_init=1)
model.fit(X)
%time
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.81 µs
In [22]:
# dump to pickle
# joblib.dump(model, 'model.pkl')

# Reload the fitted k-means model from disk.
# NOTE(review): only unpickle model files you created yourself — loading runs pickle.
model = joblib.load('model.pkl')
In [24]:
print("Top terms per cluster:")
# Rank vocabulary indices by centroid weight, descending, per cluster.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; on newer
# versions use vectorizer.get_feature_names_out() instead.
terms = vectorizer.get_feature_names()

# clusters[i] = top-25 terms for cluster i; the first 10 are also printed.
# (defaultdict is already imported at the top of the notebook — the duplicate
# import here was removed, and the two passes over each centroid row merged.)
clusters = defaultdict(list)
for i in range(6):
    print("===========")
    print("Cluster %d:" % i)
    print("===========")
    for rank, ind in enumerate(order_centroids[i, :25]):
        term = terms[ind]
        clusters[i].append(term)
        if rank < 10:
            print(' %s' % term)
    
Top terms per cluster:
===========
Cluster 0:
===========
 apple
 iphone
 new
 plus
 charging
 watch
 wireless
 ios
 oled
 year
===========
Cluster 1:
===========
 twitter
 pic
 com
 rt
 https
 s8
 galaxy
 samsung
 iphone
 dlvr
===========
Cluster 2:
===========
 iphone
 plus
 apple
 new
 phone
 just
 like
 camera
 pixel
 iphone8
===========
Cluster 3:
===========
 s8
 galaxy
 samsung
 phone
 android
 new
 google
 http
 screen
 lg
===========
Cluster 4:
===========
 s8
 galaxy
 samsung
 bixby
 new
 phone
 plus
 note
 smartphone
 display
===========
Cluster 5:
===========
 iphone
 plus
 apple
 new
 charging
 7s
 camera
 buy
 wireless
 glass

The top 10 keywords from each cluster highlight certain features along the price, quality, and value dimensions for each product.

Cluster one (label 0): Apple iPhone 8 Plus — new charging, wireless, iOS, and Apple Watch

Cluster two (label 1): Twitter chatter about Samsung and iPhone

Cluster three (label 2): iPhone Plus camera and comparison to the Pixel

Cluster four (label 3): Galaxy S8 — Android, Google, screen, and comparison to LG

Cluster five (label 4): Galaxy S8 — Bixby and display

Cluster six (label 5): Apple iPhone — new charging, camera, wireless, and comparison to the 7s.

In [89]:
model.labels_.shape
Out[89]:
(321344,)

What feature of each product each reviewer cluster cares about

In [25]:
# Attach the cluster label to each review; model.labels_ is aligned with df's
# row order because X was built from df["Sound Bite Text"] in that same order.
df["clusters"] = model.labels_
In [26]:
# Keep only the columns needed for the per-cluster analysis below.
unused_columns = ["Post Type", "Media Type", "Author ID", "Author Name",
                  "Quoted Post", 'Quoted Author Name', 'Quoted Author Handle',
                  'Total Engagements', 'Post Comments', 'Post Likes',
                  'Post Shares', 'Product Name']
df = df.drop(columns = unused_columns)
In [27]:
# Group once by cluster label; each per-cluster section below pulls its rows
# from this GroupBy via get_group().
a = df.groupby("clusters")

Cluster 1

In [28]:
# Rows assigned k-means label 0 (section headings are 1-based, labels 0-based).
cluster1 = a.get_group(0)
In [29]:
# Build the cleaned negative-objects text for cluster 1.
no_1 = " ".join(cluster1["Negative Objects"].dropna())
# Tokenize, lowercase, keep alphabetic tokens, then drop English stop words.
# Fix: the original ran the isalpha() filter twice; lowercasing an alphabetic
# token keeps it alphabetic, so the second pass was a no-op and is removed.
tokens_no1 = word_tokenize(no_1)
tokens_no1 = [token.lower() for token in tokens_no1 if token.isalpha()]
tokens_no1 = [token for token in tokens_no1 if token not in nltk_stopwords]

# to string
no1 = " ".join(tokens_no1)
In [30]:
len(tokens_no1)
Out[30]:
11445
In [31]:
def countToken(token_list):
    """Count occurrences of each token.

    Parameters
    ----------
    token_list : iterable of hashable
        Tokens to tally.

    Returns
    -------
    collections.defaultdict
        Mapping token -> occurrence count (missing keys read as 0).
    """
    # defaultdict(int) already yields 0 for missing keys, so the original's
    # explicit membership check was redundant — a bare += 1 covers both cases.
    dct = defaultdict(int)
    for token in token_list:
        dct[token] += 1
    return dct
In [44]:
# Bar chart of the 20 most frequent negative-object terms for cluster 1.
token_counts = countToken(tokens_no1)
ranked = sorted(token_counts, key = token_counts.get, reverse = True)

top_terms = ranked[:20]
frequencies = [token_counts[t] for t in top_terms]
positions = np.arange(len(top_terms))

plt.figure(figsize = (20,4))
plt.bar(positions, frequencies, color=('tab:cyan'))
plt.xticks(positions, top_terms)
plt.show()
In [154]:
# Rebuild the cleaned string (repeats the join already done in the cell above).
no1 = " ".join([i for i in tokens_no1])
In [155]:
# No-op in practice: tokens_no1 was already lowercased during cleaning.
no1 = no1.lower()
In [156]:
# Word cloud of cluster 1's negative objects; savefig runs before show() so
# the rendered figure is also written to disk.
cloud = WordCloud(width = 400, height = 200, margin = 0,
                  background_color = 'white', colormap = 'Set2',
                  max_words=200000, scale = 10).generate(no1)
plt.figure(figsize = (15,30))
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.savefig("wordcloud_cluster1.png")
plt.show()
In [45]:
# Build the cleaned positive-objects text for cluster 1.
# Fix: the original never lowercased tokens before the stop-word filter, so
# capitalized stop words (e.g. "The") slipped through — nltk_stopwords are
# lowercase. Lowercase first, matching the negative-objects cell and genToken.
po_1 = " ".join(cluster1["Positive Objects"].dropna())
tokens_po_1 = word_tokenize(po_1)
tokens_po_1 = [token.lower() for token in tokens_po_1 if token.isalpha()]
tokens_po_1 = [token for token in tokens_po_1 if token not in nltk_stopwords]

po_1 = " ".join(tokens_po_1)
po_1 = po_1.lower()  # now a no-op, kept for parity with the original flow
In [48]:
# Bar chart of the 20 most frequent positive-object terms for cluster 1.
counts = countToken(tokens_po_1)
ranked = sorted(counts, key = counts.get, reverse = True)

top_terms = ranked[:20]
frequencies = [counts[t] for t in top_terms]
positions = np.arange(len(top_terms))

plt.figure(figsize = (19,4))
plt.bar(positions, frequencies, color=('tab:cyan'))
plt.xticks(positions, top_terms)
plt.show()
In [158]:
# Word cloud of cluster 1's positive objects (also saved to disk).
cloud_po = WordCloud(width = 400, height = 200, margin = 0,
                     background_color = 'white', colormap = 'Set2',
                     max_words=200000, scale = 10).generate(po_1)
plt.figure(figsize = (15,30))
plt.imshow(cloud_po, interpolation="bilinear")
plt.axis("off")
plt.savefig("wordcloud_cluster1_po.png")
plt.show()

Cluster 2

In [52]:
cluster2 = a.get_group(1)
In [53]:
no = " ".join([i for i in list(cluster2["Negative Objects"].dropna())])
In [54]:
po = " ".join([i for i in list(cluster2["Positive Objects"].dropna())])

negative objects vs positive objects

In [49]:
def genToken(corpus):
    """Clean a raw text corpus into tokens plus a rejoined string.

    Parameters
    ----------
    corpus : str
        Raw text to clean.

    Returns
    -------
    tuple(list of str, str)
        Lowercased alphabetic tokens with English stop words removed, and
        those same tokens rejoined with single spaces.
    """
    tokens = word_tokenize(corpus)
    # Lowercase and keep alphabetic tokens in one pass. Fix: the original ran
    # the isalpha() filter a second time, but lowercasing an alphabetic token
    # keeps it alphabetic, so that pass was a wasted no-op. Locals also renamed
    # from the copy-pasted `tokens_no1` to generic names.
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in nltk_stopwords]

    return tokens, " ".join(tokens)
In [56]:
no_token,no_str = genToken(no)
In [57]:
po_token,po_str = genToken(po)
In [59]:
def toBar(tokens):
    """Bar-chart the 20 most frequent tokens.

    Parameters
    ----------
    tokens : list of str
        Cleaned tokens (e.g. output of genToken).
    """
    counts = countToken(tokens)
    ranked = sorted(counts, key = counts.get, reverse = True)

    # Fix: slice to the top 20 BEFORE building the height list; the original
    # built a frequency list over the entire vocabulary and then sliced it.
    top_terms = ranked[:20]
    heights = [counts[t] for t in top_terms]
    y_pos = np.arange(len(top_terms))

    # visualization for top 20 key words
    plt.figure(figsize = (19,4))
    plt.bar(y_pos, heights, color=('tab:cyan'))
    plt.xticks(y_pos, top_terms)
    plt.show()
In [60]:
toBar(no_token)
In [61]:
toBar(po_token)
In [62]:
def toCloud(cleanStr):
    """Render a word cloud for a cleaned text string."""
    cloud = WordCloud(width = 400, height = 200, margin = 0,
                      background_color = 'white', colormap = 'Set2',
                      max_words=200000, scale = 10).generate(cleanStr)
    plt.figure(figsize = (15,30))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
In [63]:
toCloud(no_str)
In [64]:
toCloud(po_str)

Cluster 3

In [65]:
# Cluster 3 = k-means label 2: gather its negative/positive object text.
cluster3 = a.get_group(2)
no = " ".join(cluster3["Negative Objects"].dropna())
po = " ".join(cluster3["Positive Objects"].dropna())
In [66]:
no_token,no_str = genToken(no)
po_token,po_str = genToken(po)
In [67]:
toBar(no_token)
toBar(po_token)
In [68]:
toCloud(no_str)
toCloud(po_str)

Cluster 4

In [70]:
# Cluster 4 = k-means label 3.
# Fix: the original reused the name `cluster3` here, silently clobbering the
# cluster-3 frame built above; renamed to `cluster4`.
cluster4 = a.get_group(3)
no = " ".join(cluster4["Negative Objects"].dropna())
po = " ".join(cluster4["Positive Objects"].dropna())

no_token,no_str = genToken(no)
po_token,po_str = genToken(po)

# to bar 
toBar(no_token)
toBar(po_token)

# to word cloud
toCloud(no_str)
toCloud(po_str)

Cluster 5

In [71]:
# Cluster 5 = k-means label 4: negative vs positive objects.
cluster5 = a.get_group(4)
negative_text = " ".join(cluster5["Negative Objects"].dropna())
positive_text = " ".join(cluster5["Positive Objects"].dropna())

no_token, no_str = genToken(negative_text)
po_token, po_str = genToken(positive_text)

# top-20 term bar charts
toBar(no_token)
toBar(po_token)

# word clouds
toCloud(no_str)
toCloud(po_str)

Cluster 6

In [72]:
# Cluster 6 = k-means label 5: negative vs positive objects.
cluster6 = a.get_group(5)
negative_text = " ".join(cluster6["Negative Objects"].dropna())
positive_text = " ".join(cluster6["Positive Objects"].dropna())

no_token, no_str = genToken(negative_text)
po_token, po_str = genToken(positive_text)

# top-20 term bar charts
toBar(no_token)
toBar(po_token)

# word clouds
toCloud(no_str)
toCloud(po_str)
In [ ]: